#install.packages("nycflights13")

library(nycflights13)
library(dplyr)
# [NYC Flights] a)
# Average speed per flight: keep the carrier plus the two inputs, then divide
# distance by the air time expressed in hours (air_time / 60).
flights_speed <- mutate(
  select(flights, carrier, air_time, distance),
  speed = distance / (air_time / 60)
)

# Preview the first rows of the result
head(flights_speed)
## # A tibble: 6 × 4
##   carrier air_time distance speed
##   <chr>      <dbl>    <dbl> <dbl>
## 1 UA           227     1400  370.
## 2 UA           227     1416  374.
## 3 AA           160     1089  408.
## 4 B6           183     1576  517.
## 5 DL           116      762  394.
## 6 UA           150      719  288.
# [NYC Flights] b)
library(ggplot2)

# One box per carrier summarising the distribution of flight speeds
ggplot(data = flights_speed, mapping = aes(x = carrier, y = speed)) +
  geom_boxplot() +
  theme_minimal() +
  ggtitle("Variation in Flight Speed Across Carriers") +
  xlab("Carrier") +
  ylab("Speed (miles per hour)")
## Warning: Removed 9430 rows containing non-finite outside the scale range
## (`stat_boxplot()`).

There is no large variation in flight speeds across carriers.

# [London Olympics] a)

library(tidyverse)

# Read the London Olympics dataset
olympics <- read_csv(
  "https://uwmadison.box.com/shared/static/rzw8h2x6dp5693gdbpgxaf2koqijo12l.csv"
)

# Mean age within each sport, ignoring missing ages
age_summary <- summarize(
  group_by(olympics, Sport),
  avg_age = mean(Age, na.rm = TRUE)
)

# Every age in the data as a jittered blue point, with each sport's mean age
# from age_summary overlaid as a larger red point. coord_flip() puts the
# sports on the vertical axis so their names stay readable.
ggplot(data = olympics, mapping = aes(x = Sport, y = Age)) +
  geom_jitter(width = 0.2, alpha = 0.3, color = "blue") +
  geom_point(
    data = age_summary,
    mapping = aes(x = Sport, y = avg_age),
    color = "red",
    size = 3
  ) +
  coord_flip() +
  theme_minimal() +
  ggtitle("Athlete Ages Across Sports in the 2012 London Olympics") +
  xlab("Sport") +
  ylab("Age")

# [London Olympics] b)

# Sports ordered from lowest to highest average age
sorted_sports <- arrange(age_summary, avg_age)

print(sorted_sports)
## # A tibble: 42 × 2
##    Sport                          avg_age
##    <chr>                            <dbl>
##  1 Gymnastics - Rhythmic             19.5
##  2 Gymnastics - Artistic             22.1
##  3 Swimming                          22.4
##  4 Synchronised Swimming             22.8
##  5 Diving                            22.9
##  6 Cycling - BMX, Cycling - Track    23  
##  7 Football                          23.3
##  8 Cycling - BMX                     23.5
##  9 Boxing                            24.0
## 10 Taekwondo                         24.1
## # ℹ 32 more rows
# Not required but for better understanding:

# Horizontal bars of average age; reorder() sorts the sports by that average
ggplot(
  data = sorted_sports,
  mapping = aes(x = reorder(Sport, avg_age), y = avg_age)
) +
  geom_col(fill = "steelblue") +
  coord_flip() +
  theme_minimal() +
  ggtitle("Average Age of Athletes by Sport") +
  xlab("Sport") +
  ylab("Average Age")

Question: Do male and female athletes have different average ages in certain sports? Why?

# [London Olympics] c)

# Mean age for every Sport x Sex combination. `.groups = "drop"` returns an
# ungrouped tibble: without it summarize() leaves the result grouped by Sport
# and prints a regrouping message.
age_gender_summary <- olympics |>
  group_by(Sport, Sex) |>
  summarize(avg_age = mean(Age, na.rm = TRUE), .groups = "drop")

# Visualization: side-by-side (dodged) bars per sex for each sport.
# reorder() sorts sports by the mean of their per-sex averages, and
# geom_col() is the direct equivalent of geom_bar(stat = "identity").
ggplot(
  age_gender_summary,
  aes(x = reorder(Sport, avg_age), y = avg_age, fill = Sex)
) +
  geom_col(position = "dodge") +
  coord_flip() +
  labs(title = "Comparison of Average Age Between Male and Female Athletes by Sport",
       x = "Sport",
       y = "Average Age",
       fill = "Gender") +
  theme_minimal()

Some sports may require more experience (e.g., equestrian), while others favor younger athletes (e.g., gymnastics).

I can see larger differences in average age in equestrian, cycling - mountain bike, cycling - road, and cycling - track; the reason behind that might be differences in peak performance age. Another reason might be career longevity and injury risks. However, overall there are no major differences in average age between males and females across sports.

# [Pokemon] a)

library(tidyverse)

# Read the Pokemon stats and add the attack-to-defense ratio in one pipeline
pokemon <- read_csv(
  "https://uwmadison.box.com/shared/static/hf5cmx3ew3ch0v6t0c2x56838er1lt2c.csv"
) |>
  # A ratio above 1 means Attack exceeds Defense
  mutate(attack_defense_ratio = Attack / Defense)

head(pokemon)
## # A tibble: 6 × 13
##   Name       type_1 type_2 Total    HP Attack Defense speed_attack speed_defense
##   <chr>      <chr>  <chr>  <dbl> <dbl>  <dbl>   <dbl>        <dbl>         <dbl>
## 1 Bulbasaur  Grass  Poison   318    45     49      49           65            65
## 2 Ivysaur    Grass  Poison   405    60     62      63           80            80
## 3 Venusaur   Grass  Poison   525    80     82      83          100           100
## 4 VenusaurM… Grass  Poison   625    80    100     123          122           120
## 5 Charmander Fire   <NA>     309    39     52      43           60            50
## 6 Charmeleon Fire   <NA>     405    58     64      58           80            65
## # ℹ 4 more variables: Speed <dbl>, Generation <dbl>, Legendary <lgl>,
## #   attack_defense_ratio <dbl>
# [Pokemon] b)

# Median attack-to-defense ratio for each primary type, largest median first
median_ratios <- pokemon |>
  group_by(type_1) |>
  summarize(
    median_ratio = median(attack_defense_ratio, na.rm = TRUE)
  ) |>
  arrange(desc(median_ratio))

# Show the six types with the highest median ratio
head(median_ratios)
## # A tibble: 6 × 2
##   type_1   median_ratio
##   <chr>           <dbl>
## 1 Fighting         1.57
## 2 Dragon           1.38
## 3 Fire             1.33
## 4 Dark             1.29
## 5 Normal           1.23
## 6 Poison           1.15
# [Pokemon] c)

# Re-encode type_1 as a factor whose level order follows median_ratios
# (highest median ratio first), so the facets below appear in that order
pokemon <- pokemon |>
  mutate(type_1 = factor(type_1, levels = median_ratios$type_1))

# One Defense-vs-Attack scatter panel per primary type. scales = "free" gives
# each panel its own axis ranges; the colour legend is hidden because the
# facet strips already name the type.
ggplot(
  data = pokemon,
  mapping = aes(x = Defense, y = Attack, color = type_1)
) +
  geom_point(alpha = 0.6) +
  facet_wrap(vars(type_1), scales = "free") +
  theme_minimal() +
  theme(legend.position = "none") +
  ggtitle("Attack vs. Defense by Pokemon Type") +
  xlab("Defense") +
  ylab("Attack")

[Pokemon] d)

A dynamic visualization would enable users to interact with the data to filter, highlight, or compare Pokémon types.

Potential Questions:

- How do Pokémon types compare in attack-to-defense ratio? Users can select a type from a dropdown to highlight it.
- What are the strongest defensive and offensive Pokémon? Sliders can filter by attack or defense stats.
- How do Legendary Pokémon compare to non-Legendary ones? A toggle can highlight only Legendary Pokémon.

Structure of Interaction:

- Dropdown Menu: Highlight a specific Pokémon type.
- Sliders: Filter by Attack, Defense, or attack-to-defense ratio.
- Hover Tooltips: Display stats (e.g., name, attack, defense) on hover.
- Checkbox Toggle: Compare Legendary vs. non-Legendary Pokémon.

How the Display Updates:

- Selecting a Pokémon type highlights it, with others becoming transparent.
- Sliders dynamically filter Pokémon based on the selected range.
- The plot updates to highlight Legendary Pokémon when toggled.

[Gene Expression Faceting]

# [Gene Expression Faceting] a)

library(ggplot2)
library(dplyr)
library(readr)

# Read the gene expression measurements
genes <- read_csv(
  "https://uwmadison.box.com/shared/static/dwzchdtfca33r0f6i055k2d0939onnlv.csv"
)

# Small multiples: one panel per gene in a 2 x 4 grid, plotting
# log(1 + value) against time as semi-transparent black points
ggplot(data = genes, mapping = aes(x = time, y = log1p(value))) +
  geom_point(color = "black", alpha = 0.3) +
  facet_wrap(vars(gene), nrow = 2, ncol = 4) +
  theme_minimal() +
  theme(
    # Grey facet headers with bold labels
    strip.background = element_rect(fill = "grey80"),
    strip.text = element_text(face = "bold"),
    # Bold axis titles
    axis.title = element_text(face = "bold"),
    # Light major gridlines only
    panel.grid.major = element_line(color = "grey90"),
    panel.grid.minor = element_blank()
  ) +
  xlab("Time") +
  ylab("log(1 + value)")

[Gene Expression Faceting] b)

Strength of Small Multiples: One major strength of using small multiples is that they allow for easy comparison across multiple categories (in this case, genes). Since each gene has its own separate panel, it becomes much simpler to observe patterns, trends, and differences without overlapping data points from other genes. This clarity makes it easier to identify trends and anomalies in gene expression over time.

Weakness of Small Multiples: A notable weakness is that small multiples can become cluttered and difficult to interpret when dealing with a large number of categories. If too many genes were included, the individual panels would shrink, making it harder to discern patterns within each plot. Additionally, if the y-axes are not consistently scaled, comparing expression levels between genes might become misleading.

# [Gene Expression Faceting] c)

# Bin time by rounding to two decimals, then average the expression value
# within every gene x time-bin cell
gene_groups <- genes |>
  mutate(rounded_time = round(time, 2)) |>
  group_by(gene, rounded_time) |>
  summarise(mean_value = mean(value), .groups = "drop")

# Gene names sorted by ascending total expression. Used as factor levels
# below, this places the least abundant gene first on the y-axis.
gene_order <- gene_groups |>
  group_by(gene) |>
  summarise(total_expression = sum(mean_value)) |>
  arrange(total_expression) |>
  pull(gene)

# Heatmap: one tile per gene x time bin, shaded from white to blue by
# log(1 + mean_value)
ggplot(
  data = gene_groups,
  mapping = aes(
    x = rounded_time,
    y = factor(gene, levels = gene_order),
    fill = log1p(mean_value)
  )
) +
  geom_tile() +
  scale_fill_gradient(low = "white", high = "#1F77B4") +
  theme_minimal(base_size = 14) +
  theme(
    # Bold tick labels and axis titles
    axis.text.x = element_text(size = 12, face = "bold"),
    axis.text.y = element_text(size = 12, face = "bold"),
    axis.title.x = element_text(size = 14, face = "bold"),
    axis.title.y = element_text(size = 14, face = "bold"),
    # No gridlines behind the tiles
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    plot.margin = margin(10, 20, 10, 10)
  ) +
  labs(
    x = "Time (Rounded)",
    y = "Gene (Sorted by Expression)",
    fill = "log(1 + mean_value)"
  )

[Gene Expression Faceting] d)

To modify the small multiples plot from part (a) and overlay the smooth fitted curve, we load the fitted values and add them as a `geom_line()` layer on top of the raw points:

# Read the fitted values (smooth curve predictions) for each gene over time
fitted_values <- read_csv("https://go.wisc.edu/x678hu")

# Attach the fitted values to the raw measurements by gene and time.
# NOTE(review): this join reports a many-to-many match, so rows are
# duplicated; the plot in this section draws from `genes` and
# `fitted_values` directly, so confirm whether `merged_data` is needed.
merged_data <- left_join(genes, fitted_values, by = c("gene", "time"))
## Warning in left_join(., fitted_values, by = c("gene", "time")): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 33 of `x` matches multiple rows in `y`.
## ℹ Row 141 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
# Small multiples of the raw data with the fitted curve overlaid.
# Each layer supplies its own data: points come from `genes`, the line from
# `fitted_values` (column `mu`), both on the log(1 + y) scale and split into
# one panel per gene through the shared `gene` column.
ggplot() +
  geom_point(
    data = genes,
    aes(x = time, y = log1p(value)),
    alpha = 0.3,
    color = "black"
  ) +
  # `linewidth` replaces the `size` aesthetic for lines, which is deprecated
  # as of ggplot2 3.4.0
  geom_line(
    data = fitted_values,
    aes(x = time, y = log1p(mu)),
    color = "darkred",
    linewidth = 1.2
  ) +
  facet_wrap(~gene, nrow = 2, ncol = 4) +
  theme_minimal() +
  labs(y = "log(1 + value)", x = "Time") +
  theme(
    strip.background = element_rect(fill = "grey80"),  # Grey facet headers
    strip.text = element_text(face = "bold"),  # Bold facet labels
    axis.title = element_text(face = "bold"),  # Bold axis titles
    panel.grid.major = element_line(color = "grey90"),  # Light grey gridlines
    panel.grid.minor = element_blank(),  # No minor gridlines
    # element_rect() also takes `linewidth` instead of the deprecated `size`
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1.2)
  )

How This Relates to the “Grammar of Graphics”:

Layering: The geom_point() layer represents raw gene expression data. The geom_line() layer overlays the fitted curve in red.

Aesthetic Mapping (aes()): The x-axis (time) and y-axis (log1p(value)) map the data points and fitted values.

Faceting: facet_wrap(~gene) ensures each gene gets its own panel.

Customization & Clarity: Color (black for raw points, darkred for fitted curve) enhances readability. Gridlines & Borders provide a polished appearance.

[Visual Redesign] a)

Here’s one of the visualizations I previously created, showing the yearly trends in “THREES Share” and “THREES FG%” for a dataset on shooting splits over several years.

The first plot shows the Yearly Threes Share over time. The second plot shows the Yearly Threes Made percentage over time.

# original code

# Read the shooting-splits table from the working directory
shoot <- read_csv("./Shooting Splits.csv")

# Drop 2020, give the two three-point columns syntactic names, then attach
# each year's average of both metrics to every row. mutate() keeps all rows,
# so the yearly averages repeat within a year and the result stays grouped
# by YEAR.
mmdata <- shoot |>
  filter(YEAR != 2020) |>
  rename(THREES_SPLIT = "THREES SHARE", MADE_THREES = "THREES FG%") |>
  group_by(YEAR) |>
  mutate(
    YEARLY_THREES_SHARE = sum(THREES_SPLIT) / n(),
    YEARLY_THREES_MADE = sum(MADE_THREES) / n()
  )

# Yearly three-point share over time, with a smoothed trend
ggplot(data = mmdata, mapping = aes(x = YEAR, y = YEARLY_THREES_SHARE)) +
  geom_point() +
  geom_smooth()

# Yearly three-point FG% over time, with a smoothed trend
ggplot(data = mmdata, mapping = aes(x = YEAR, y = YEARLY_THREES_MADE)) +
  geom_point() +
  geom_smooth()

The data focuses on shooting performance, with a focus on the percentage of “threes” made and their share in a team’s performance each year. These are scatter plots with smoothing lines.

[Visual Redesign] b)

Main Takeaways: Both plots show the trend of three-point share and shooting percentage over time. We can infer how the performance has fluctuated yearly. The smoother lines provide a clearer trend despite the variation in individual years.

The Yearly Threes Share plot shows a general upward trend, implying that three-point shooting has become more prevalent over the years. The Yearly Threes Made plot shows fluctuations, with some years having a steep increase in shooting percentage, possibly pointing to specific player performance improvements.

Intended Message Consistency: The plots communicate the message of how shooting trends have evolved over time. However, they could be made clearer to show year-on-year comparisons of these two metrics in a more comparative way.

Hard-to-Highlight Comparisons: Comparing the two plots side-by-side is difficult when you want to see how the two metrics correlate. It would be better to merge the data for both aspects into a single visualization to compare the trends more easily.

[Visual Redesign] c)

Legibility of the Original Visualization:

Clarity of Data Points: While scatter plots can be useful, they may be difficult to read if there are too many overlapping points or if the trends aren’t clear enough. Here, the scatter points could be a bit noisy, making it harder to discern specific trends at a glance.

Lack of Comparison: There are two separate plots that display different metrics (threes share and threes made), making it harder to compare them directly.

Aesthetics: The use of scatter points and smoothing lines is fine, but it doesn’t fully highlight the correlation between the two metrics over the years. The two plots could be made more compact for easier comparison.

[Visual Redesign] d)

Proposed Redesign: For the redesign, I would propose merging these two visuals into a single plot with both metrics combined and enhanced for legibility and clarity. Here’s how I would proceed with the redesign:

Combine the Metrics into One Plot: Instead of showing two separate plots, I would create a line plot with two different colored lines (one for Yearly Threes Share and one for Yearly Threes Made) to allow for easier comparison.

Legibility Improvements:

Line Plot: Use lines instead of points for clearer trend visibility. Dual Y-Axes: If the scales for the two metrics differ significantly, using a dual y-axis could make the trends easier to compare while maintaining clarity. Color Coding: Use different colors to distinguish between the two metrics clearly (e.g., blue for threes share, orange for threes made). Annotations: Annotate key points where significant changes happen in the trends (e.g., sudden spikes in shooting performance) to give viewers key insights.

# Redesign: both yearly metrics as coloured lines on one shared y-axis.
# The constant strings inside aes() create the legend entries that
# scale_color_manual() maps to blue and orange.
# `linewidth` is used for line thickness; `size` for lines is deprecated as
# of ggplot2 3.4.0.
ggplot(mmdata, aes(x = YEAR)) +
  geom_line(aes(y = YEARLY_THREES_SHARE, color = "Threes Share"), linewidth = 1) +
  geom_line(aes(y = YEARLY_THREES_MADE, color = "Threes Made"), linewidth = 1) +
  scale_color_manual(values = c("Threes Share" = "blue", "Threes Made" = "orange")) +
  labs(title = "Comparison of Yearly Threes Share and Threes Made",
       x = "Year",
       y = "Percentage (%)",
       color = "Metrics") +
  theme_minimal() +
  theme(legend.position = "bottom")

Key Points in the New Design:

Visual Tasks: The main visual tasks are to highlight the trend over time, compare the yearly performance of the two metrics, and easily spot trends and key points in the data.

Trade-Offs: Using a dual line plot means sacrificing the individual point-level detail that the scatter plot provides. However, this trade-off enhances clarity in showing the overall trend and comparison between the two metrics.

Legibility Improvements:

Using line plots instead of scatter plots for smoother trend visualization. Color coding makes it easier to distinguish between the two metrics; the final plot draws both on a single shared y-axis rather than dual y-axes. The legend is moved to the bottom for clarity, making the plot less cluttered.

[California Wildfire Alternatives] a)

library(ggplot2)
library(dplyr)
library(readr)

# Read the wildfire records and keep only the columns used in this section
fires <- read_csv(
  "https://uwmadison.box.com/shared/static/k5vvekf1bhh9e16qb9s66owygc70t7dm.csv"
) |>
  select(Name, Counties, year, day_of_year, AcresBurned, MajorIncident)

# County names sorted by ascending total acres burned (missing acreage is
# ignored); used later as factor levels for the y-axis
county_order <- fires |>
  group_by(Counties) |>
  summarise(total_burned = sum(AcresBurned, na.rm = TRUE)) |>
  arrange(total_burned) |>
  pull(Counties)

# Bubble plot: one point per fire, placed by day of year (x) and county (y,
# ordered by county_order), sized by log1p(AcresBurned), with one facet per
# year laid out in a single row.
ggplot(
  fires,
  aes(
    x = day_of_year,
    y = factor(Counties, levels = county_order),
    size = log1p(AcresBurned)
  )
) +
  geom_point(color = "black", alpha = 0.7) +
  facet_wrap(~year, nrow = 1, scales = "free_x") +
  theme_minimal(base_size = 14) +
  labs(y = "County (Sorted by Acres Burned)", x = "Day of Year", size = "log(AcresBurned)") +
  theme(
    strip.background = element_rect(fill = "grey80", color = "black"),  # Grey header with black outline
    strip.text = element_text(size = 16, face = "bold", color = "black"),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12, angle = 45, hjust = 1),  # Rotated to prevent overlap
    axis.text.y = element_text(size = 10),
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold"),
    legend.position = "right",
    panel.grid.major = element_line(color = "grey85"),  # Light grey gridlines
    # `linewidth` replaces element_rect()'s `size`, deprecated in ggplot2 3.4.0
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1.2)
  ) +
  scale_size_continuous(range = c(1, 6))  # Point sizes span 1 to 6
## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

Strengths of This Approach:

Weaknesses of This Approach:

[California Wildfire Alternatives] b)

# Box plots of log1p(AcresBurned) per year, split and filled by whether the
# fire was flagged as a major incident.
ggplot(fires, aes(x = factor(year), y = log1p(AcresBurned), fill = MajorIncident)) +
  geom_boxplot(outlier.shape = 21, outlier.color = "black", alpha = 0.7) +
  scale_fill_manual(values = c("TRUE" = "#E66101", "FALSE" = "#5EBC89")) +
  theme_minimal(base_size = 14) +
  labs(y = "log(AcresBurned)", x = "Year", fill = "Major Incident?") +
  theme(
    # `linewidth` replaces element_rect()'s `size`, deprecated in ggplot2 3.4.0
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1.2),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 14),
    axis.text.y = element_text(size = 12),
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold"),
    legend.position = "right"
  )
## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_boxplot()`).

Strengths of This Approach

- Good for comparing wildfire severity across years – The boxplots summarize distributions of wildfire sizes each year, making it easy to see which years had more severe fires.
- Highlights the impact of “Major Incidents” – The color coding allows for a quick visual differentiation between large-scale incidents and smaller fires.

Weaknesses of This Approach

- Does not show wildfire seasonality – Unlike Approach 1, this does not show when fires happened during the year.
- Hard to see individual fires – Boxplots summarize data, so specific details about individual fires (such as location or exact timing) are lost.

[California Wildfire Alternatives] c)

# The 15 fires with the largest burned area, largest first
top_fires <- fires |>
  arrange(desc(AcresBurned)) |>
  slice_head(n = 15)

# Fill colour for each year 2013-2019, keyed by the year as a string so it
# matches the levels of factor(year) in the plot
year_colors <- setNames(
  c(
    "#8DD3C7", "#FFFFB3", "#BEBADA", "#FB8072",
    "#80B1D3", "#FDB462", "#B3DE69"
  ),
  as.character(2013:2019)
)

# Horizontal bar chart of the largest fires: bar length is AcresBurned, fires
# are ordered by acreage via reorder(), and fill colour encodes the year
# using the year_colors palette.
ggplot(top_fires, aes(x = AcresBurned, y = reorder(Name, AcresBurned), fill = factor(year))) +
  geom_col() +
  scale_fill_manual(values = year_colors) +
  scale_x_continuous(expand = c(0, 0)) +  # No padding, so bars start flush at zero
  theme_minimal(base_size = 14) +
  labs(y = "Fire", x = "Acres Burned", fill = "Year", title = "Fires with the Most Acres Burned") +
  theme(
    # `linewidth` replaces element_rect()'s `size`, deprecated in ggplot2 3.4.0
    panel.border = element_rect(color = "black", fill = NA, linewidth = 1.2),
    axis.title = element_text(size = 16, face = "bold"),
    axis.text.x = element_text(size = 12),
    axis.text.y = element_text(size = 10),
    legend.text = element_text(size = 12),
    legend.title = element_text(size = 14, face = "bold"),
    legend.position = "right"
  )

Strengths of This Approach

- Best for identifying the largest individual wildfires – This visualization clearly shows which fires were the most severe and allows comparison across years.
- Year-based color coding enhances readability – Using different colors for each year helps differentiate fire events across time.

Weaknesses of This Approach

- Not useful for comparing overall wildfire trends – This only shows the largest fires, so it doesn’t provide insight into all fires or how fire severity changes over time.
- Does not show fire locations or seasonality – Unlike the other approaches, this focuses only on total burned area, ignoring when or where the fires occurred.

[California Wildfire Alternatives] d)

I did it for all 3 plots.

#install.packages("magick")  # Run this if you haven't installed magick
library(magick)

# Read the saved screenshot from the working directory, then print it
img <- image_read(path = "screenshot1.png")
print(img)
## # A tibble: 1 × 7
##   format width height colorspace matte filesize density
##   <chr>  <int>  <int> <chr>      <lgl>    <int> <chr>  
## 1 PNG     2082   1626 sRGB       TRUE   2779407 57x57